#coding=utf-8
'''
Two loop version took 23.252080 seconds
One loop version took 51.822145 seconds
No loop version took 0.592002 seconds
'''
import numpy as np
class KNearestNeightbor(object):
    def __init__(self):
        pass
    def train(self, X, y):
        self.X_train = X
        self.y_train = y
    def predict(self, X, k=1, num_loops=0):
        if num_loops == 0:
            dists = self.compute_distance_no_loops(X)
        elif num_loops == 1:
            dists = self.compute_distance_one_loop(X)
        elif num_loops == 2:
            dists = self.compute_distance_two_loops(X)
        else:
            raise ValueError('Invalid value %d for num_loops'%num_loops)
        return self.predict_labels(dists, k=k)
    #用两个for循环计算L2距离
    def compute_distance_two_loops(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            for j in range(num_train):
                distances = np.sqrt(np.sum(np.square(self.X_train[j] - X[i])))
                dists[i, j] = dists
        return dists
    #用一个for循环计算L2距离
    def compute_distance_one_loop(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        for i in range(num_test):
            distances = np.sqrt(np.sum(np.square(self.X_train - X[i]), axis=1))
            dists[i, :] = distances
        return dists
    #不使用for循环计算距离
    def compute_distance_no_loops(self, X):
        num_test = X.shape[0]
        num_train = self.X_train.shape[0]
        dists = np.zeros((num_test, num_train))
        '''
        M = np.dot(X, self.X_train.T)计算出了(x - y)^2 = x^2 + y^2 - 2*x*y中的x*y。其中矩阵维度为(m1, m2)。
        te = np.diag(np.dot(X, X.T))取出了X * X.T的对角线，元素为每个测试样本的各个维度平方的和。tr亦然，只是主体为训练样本。te、tr是一个行向量，te维度为m1，tr维度为m2
        te = np.reshape(np.repeat(te, ncol), M.shape)这行代码，首先将te的每个元素重复m2次，并reshape为(m1, m2)。每一行元素都是相等的，第i行的元素为第i个测试样本的平方和。te计算出了x^2。
        tr = np.reshape(np.repeat(tr, nrow), M.T.shape)亦然，只是重复了m1次。tr的shape为(m2, m1)。tr.T的每行元素的第j个元素为第j个训练样本的平方和。tr.T各行内容重复。tr.T计算出了y^2。
        sq = -2 * M + te + tr.T计算出了(x - y)^2 = x^2 + y^2 - 2*x*y
        '''
        M = np.dot(X, self.X_train.T)
        nrow = M.shape[0]
        ncol = M.shape[1]
        te = np.diag(np.dot(X, X.T))
        tr = np.diag(np.dot(self.X_train, self.X_train.T))
        te = np.reshape(np.repeat(te, ncol), M.shape)
        tr = np.reshape(np.repeat(tr, nrow), M.T.shape)
        sq = -2 * M + te + tr.T
        return dists
    #预测标签
    def predict_labels(self, dists, k=1):
        num_test = dists.shape[0]
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            closest_y = []
            distances = dists[i, :]
            indexes = np.argsort(distances)
            closest_y = self.y_train[indexes[:k]]
            count = np.bincount(closest_y) #统计所有数字的出现次数
            y_pred[i] = np.argmax(count) #投票函数
        return y_pred

